suppressPackageStartupMessages({
library(DESeq2)
library(dplyr)
library(gplots)
library(ggplot2)
library(here)
library(hyperSpec)
library(limma)
library(parallel)
library(plotly)
library(tibble)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
load(here("analysis/salmon/ZE-29Seed-dds.rda"))
levels(dds.29z$Experiment) <- c("ZE","Seed")
vsd <- varianceStabilizingTransformation(dds.29z,blind=FALSE)
mat <- assay(vsd)
We select FMG and ZE from mature seeds (29Seed), that correspond to Stage B8 and B9 of the ZE
sel <- dds.29z$Experiment == "Seed" | dds.29z$Time %in% c("B8","B9")
mat <- mat[,sel]
batch = dds.29z$Experiment[sel]
contrasts(batch) <- contr.sum(levels(batch))
design <- model.matrix(~batch)
fit <- lmFit(mat, design)
beta is the actual batch correction for each gene
beta <- fit$coefficients[, -1, drop = FALSE]
beta[is.na(beta)] <- 0
cross-validation with the removeBatchEffect function
stopifnot(all((mat - beta %*% t(design[,-1,drop=FALSE]))[1:6,1:6] ==
removeBatchEffect(mat,dds.29z$Experiment[sel])[1:6,1:6]))
load(here("analysis/salmon/ZE-SE-dds.rda"))
levels(dds.sz$Experiment) <- c("ZE","SE")
vsd <-varianceStabilizingTransformation(dds.sz,blind=FALSE)
fullbatch <- vsd$Experiment
contrasts(fullbatch) <- contr.sum(levels(fullbatch))
fullbatch <- model.matrix(~fullbatch)[, -1, drop = FALSE]
assay(vsd) <- assay(vsd) - beta %*% t(fullbatch)
pc <- prcomp(t(assay(vsd)))
percent <- round(summary(pc)$importance[2,]*100)
We define the number of variable of the model
nvar=2
An the number of possible combinations
nlevel=nlevels(vsd$Tissue) * nlevels(vsd$Time)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
as.data.frame(colData(vsd)))
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise - the filtering is meant to restrict the number of genes to be visualised to a reasonable amount that can be viewed in a limited amount of time
conds <- factor(paste(vsd$Time,vsd$Tissue))
sels <- rangeFeatureSelect(counts=assay(vsd),
conditions=conds,
nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
vst.cutoff <- 6
hm <- heatmap.2(t(scale(t(assay(vsd)[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=conds,cex=0.8)
dir.create(here("analysis/batchCorrection"),showWarnings=FALSE)
save(vsd,file=here("analysis/batchCorrection/vsd.rda"))
Based on the visualisation above, we focus on ZE Stage 3-9 and SE Stage 3-6
sample.sel <- (vsd$Time %in% paste0("B",3:6) & vsd$Tissue =="SE") |
(vsd$Tissue=="ZE" & vsd$Time %in% paste0("B",3:9))
vsd <- vsd[,sample.sel]
vsd$Time <- droplevels(vsd$Time)
pc <- prcomp(t(assay(vsd)))
percent <- round(summary(pc)$importance[2,]*100)
We define the number of variable of the model
nvar=2
An the number of possible combinations
nlevel=nlevels(vsd$Tissue) * nlevels(vsd$Time)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
as.data.frame(colData(vsd)))
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
conds <- factor(paste(vsd$Time,vsd$Tissue))
sels <- rangeFeatureSelect(counts=assay(vsd),
conditions=conds,
nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
vst.cutoff <- 4
hc <- hclust(pearson.dist(scale(t(assay(vsd)[sels[[vst.cutoff+1]],]))),
method="ward.D2")
plot(hc,xlab="",sub="",labels=conds,cex=0.8)
The batch correction worked very nicely and while the “Tissue” Se vs. ZE still explains the variance in the first principal component, the second component explains the developmental series and is in agreement with the expectations, especially with regards to the maturation (SE, B3,4,5, ZE B3-B6) and desiccation (SE, B6 - ZE B7-9) time points.
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] tibble_3.0.4 plotly_4.9.2.1
## [3] limma_3.46.0 hyperSpec_0.99-20201127
## [5] xml2_1.3.2 lattice_0.20-41
## [7] here_1.0.0 ggplot2_3.3.2
## [9] gplots_3.1.1 dplyr_1.0.2
## [11] DESeq2_1.30.0 SummarizedExperiment_1.20.0
## [13] Biobase_2.50.0 MatrixGenerics_1.2.0
## [15] matrixStats_0.57.0 GenomicRanges_1.42.0
## [17] GenomeInfoDb_1.26.1 IRanges_2.24.0
## [19] S4Vectors_0.28.0 BiocGenerics_0.36.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_4.0.5 RColorBrewer_1.1-2
## [4] httr_1.4.2 rprojroot_2.0.2 tools_4.0.3
## [7] R6_2.5.0 KernSmooth_2.23-18 DBI_1.1.0
## [10] lazyeval_0.2.2 colorspace_2.0-0 withr_2.3.0
## [13] tidyselect_1.1.0 bit_4.0.4 compiler_4.0.3
## [16] Cairo_1.5-12.2 DelayedArray_0.16.0 labeling_0.4.2
## [19] caTools_1.18.0 scales_1.1.1 genefilter_1.72.0
## [22] stringr_1.4.0 digest_0.6.27 rmarkdown_2.5
## [25] XVector_0.30.0 jpeg_0.1-8.1 pkgconfig_2.0.3
## [28] htmltools_0.5.0 highr_0.8 htmlwidgets_1.5.2
## [31] rlang_0.4.9 RSQLite_2.2.1 generics_0.1.0
## [34] farver_2.0.3 jsonlite_1.7.1 crosstalk_1.1.0.1
## [37] BiocParallel_1.24.1 gtools_3.8.2 RCurl_1.98-1.2
## [40] magrittr_2.0.1 GenomeInfoDbData_1.2.4 Matrix_1.2-18
## [43] Rcpp_1.0.5 munsell_0.5.0 lifecycle_0.2.0
## [46] stringi_1.5.3 yaml_2.2.1 zlibbioc_1.36.0
## [49] blob_1.2.1 crayon_1.3.4 splines_4.0.3
## [52] annotate_1.68.0 locfit_1.5-9.4 knitr_1.30
## [55] pillar_1.4.7 geneplotter_1.68.0 XML_3.99-0.5
## [58] glue_1.4.2 evaluate_0.14 latticeExtra_0.6-29
## [61] data.table_1.13.4 vctrs_0.3.5 png_0.1-7
## [64] testthat_3.0.0 gtable_0.3.0 purrr_0.3.4
## [67] tidyr_1.1.2 xfun_0.19 xtable_1.8-4
## [70] survival_3.2-7 viridisLite_0.3.0 AnnotationDbi_1.52.0
## [73] memoise_1.1.0 ellipsis_0.3.1